#!/usr/bin/env python
# -*- coding:utf-8 -*- 
# @Time    : 2018/11/17 18:04
# @Author  : liujiantao
# @Site    : https://blog.csdn.net/FlySky1991/article/details/80526257
# @File    : outliers_detection.py
# @Software: PyCharm

'''
目标：比较One-Class SVM、EllipticEnvelope、Isolation Forest、
LocalOutlierFactor这4种异常检测算法在相同数据集下的异常检测效果。
'''

# import numpy as np
# from scipy import stats
# import matplotlib.pyplot as plt
# import matplotlib.font_manager
#
from sklearn import svm, preprocessing

# from sklearn.covariance import EllipticEnvelope
# from sklearn.ensemble import IsolationForest
# from sklearn.neighbors import LocalOutlierFactor
#
# rng = np.random.RandomState(42)
#
# # 设置样本数量、异常样本比例，不同类样本分离度
# n_samples = 256
# outliers_fraction = 0.25
# clusters_separation = [0, 1, 2]
#
# # 定义各种异常检测方法
# classifiers = {
#     "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel='rbf', gamma=0.1),
#     "Roubst covariance": EllipticEnvelope(contamination=outliers_fraction),
#     "Isolation Forest": IsolationForest(max_samples=n_samples,
#                                         contamination=outliers_fraction,
#                                         random_state=rng),
#     "Local Outlier Factor": LocalOutlierFactor(n_neighbors=35,
#                                                contamination=outliers_fraction)}
# # 样本集各变量赋初值
# xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
# n_inliers = int((1 - outliers_fraction) * n_samples)
# n_outliers = int(outliers_fraction * n_samples)
# ground_truth = np.ones(n_samples, dtype=int)
# ground_truth[-n_outliers:] = -1
#
# # 在不同的样本分离度下测试异常检测效果
# for i, offset in enumerate(clusters_separation):
#     np.random.seed(42)
#     # 生成数据
#     X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
#     X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
#     X = np.r_[X1, X2]
#     X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
#
#     # 模型匹配
#     plt.figure(figsize=(9, 7))
#     for j, (clf_name, clf) in enumerate(classifiers.items()):
#         # 数据匹配
#         if clf_name == "Local Outlier Factor":
#             y_pred = clf.fit_predict(X)
#             scores_pred = clf.negative_outlier_factor_
#         else:
#             clf.fit(X)
#             scores_pred = clf.decision_function(X)
#             y_pred = clf.predict(X)
#         threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
#         n_errors = (y_pred != ground_truth).sum()
#
#  # 画图
# if clf_name == "Local Outlier Factor":
#     Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
# else:
#     Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
# Z = Z.reshape(xx.shape)
# subplot = plt.subplot(2, 2, j + 1)
# subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
#                  cmap=plt.cm.Blues_r)
# a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
# subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
# # 正常样本
# b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white', s=20, edgecolor='k')
# # 异常样本
# c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black', s=20, edgecolor='k')
# subplot.axis('tight')
# subplot.legend(
#     [a.collections[0], b, c],
#     ['learned decision function', 'true inliers', 'true outliers'],
#     prop=matplotlib.font_manager.FontProperties(size=10),
#     loc='lower right')
# subplot.set_xlabel("{}.{}({})".format(j + 1, clf_name, n_errors))
# subplot.set_xlim((-7, 7))
# subplot.set_ylim((-7, 7))
# plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.20)
# plt.suptitle("Outlier detection{}".format(i + 1))
#
# plt.show()
# print(123)
#
# from sklearn.datasets import load_boston
# import numpy as np
# import pandas as pd
# from sklearn import preprocessing
#
# data = load_boston()
# boston = pd.DataFrame(data.data)
# boston.columns = data['feature_names']
# boston.pop('CHAS')  # remove column
# normolized_data = preprocessing.StandardScaler().fit_transform(boston)
# boston.boxplot(sym='r', vert=False, patch_artist=True, meanline=False, showmeans=True)
#
# from sklearn import preprocessing
# from sklearn.datasets import make_blobs
#
# blobs = make_blobs(n_samples=1000, n_features=2, centers=1, cluster_std=1.5, shuffle=True, random_state=5)
# normolized_data = preprocessing.StandardScaler().fit_transform(blobs[0])
#
# out_fraction = 0.02
# nu_estimate = 0.95 * out_fraction + 0.05
# mechine_learning = svm.OneClassSVM(kernel="rbf", degree=3, gamma=1.0 / len(normolized_data), nu=nu_estimate)
# mechine_learning.fit(normolized_data)
# detection = mechine_learning.predict(normolized_data)
# outliers = np.where(detection == -1)
# regular = np.where(detection == 1)
# from matplotlib import pyplot as plt
#
# a = plt.plot(normolized_data[regular, 0], normolized_data[regular, 1], 'x', markersize=2, color="green", alpha=0.6)
# b = plt.plot(normolized_data[outliers, 0], normolized_data[outliers, 1], 'o', color='red', markersize=6)
#
from sklearn.datasets import make_blobs


def IsolationForest(X_train, n_samples=200, outliers_fraction=0.25,
                    grid_range=(-7.0, 7.0), grid_points=50, random_state=42):
    """Fit an Isolation Forest on ``X_train`` and score a 2-D evaluation grid.

    Reference: https://www.cnblogs.com/bonelee/p/7776711.html

    Isolation Forest is unsupervised: ``fit(X)`` learns the model,
    ``predict(X)`` returns +1 for normal samples and -1 for anomalies, and
    ``decision_function(X)`` returns an anomaly score where smaller values
    are more likely anomalous.

    :param X_train: 2-column array-like of training samples (the grid below
        is 2-D, so the model must be trained on 2 features).
    :param n_samples: ``max_samples`` passed to the forest (was hard-coded 200).
    :param outliers_fraction: expected anomaly ratio, passed as ``contamination``.
    :param grid_range: (low, high) bounds of the square evaluation grid.
    :param grid_points: number of grid points per axis.
    :param random_state: seed for the forest's RandomState.
    :return: ``(grid_points, grid_points)`` array of decision-function scores.
    """
    import numpy as np
    # NOTE: this import deliberately shadows the enclosing function's name
    # inside the function body, so references below hit the sklearn class.
    from sklearn.ensemble import IsolationForest

    rng = np.random.RandomState(random_state)

    # Fit the model on the caller-supplied data. (The original body also
    # generated an unused synthetic sample and an unused score threshold;
    # that dead code has been removed.)
    clf = IsolationForest(max_samples=n_samples, random_state=rng,
                          contamination=outliers_fraction)
    clf.fit(X_train)

    # Score every point of a regular 2-D grid so callers can contour-plot
    # the learned decision function.
    low, high = grid_range
    xx, yy = np.meshgrid(np.linspace(low, high, grid_points),
                         np.linspace(low, high, grid_points))
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    return Z.reshape(xx.shape)


from tiancheng.base.base_helper import *

# Load a previously pickled merged-operations frame and keep the rows for a
# single user (UID == 10035).
# NOTE(review): `pd`, `features_base_path` and `op_hd` are all expected to
# come from the star import above — verify base_helper actually exports them.
op_merge = pd.read_pickle(features_base_path + "op_merge.pkl")
data01 = op_merge[op_merge[op_hd.UID] == 10035]
# IsolationForest(data01)
# NOTE(review): `outliers_detection` is not defined in this file; presumably
# it is provided by the star import — confirm, otherwise this line raises
# NameError at import time.
outliers_detection(data01)
